Communication
- For Niraj TE
- Mut vs WT
- Batch: SE and PE
- Combine Regular Genes Expression (RNAseq data) with TE expression (better normalized expression)
Notes:
Code LOG:
- included annotation
- included merging
- used TPM normalization
- save seperate results into excel rather than csv to prevent gene name bug in excel
- print cut-offs
- export results in original order, not ordered by LFC nor FDR (more robust)
- export significant results, too (FDR and |LFC|, not including edges)
- better VolcanoPlot by setting Max Y to 50 (>50 set to 50)
- better html output, skip nb, use html
- both OneBigModel and SeperatedModels
- fixed row.names (gencode id) bug
- output both FPKM and TPM, 06/28/2019
- both shrinked LFC and raw LFC in same output Excel, and both plots
- add lib-size
- Pvalue -> FDR in Volcano-plot, ylim 0,50, xlim fixing, smaller dots
- optional function to aggregate multi-loci genes (e.g. his1, his2a, his2b, his3, his4)
- Use gray scale for MA/Volcano-plots
- Output COUNT.xlsx
- volcano plot: up/down count
- length in TPM and DEseq2 outputs after aggregating histone genes
- Reverse SampleNAme:RPKM
- hist log(count+1)
- added optinal density scatterplot between length and |LFC|
- fixed sig.xlsx index bug
Print Cut-offs
paste("FDR cut-off:", thresh_p)
## [1] "FDR cut-off: 0.05"
paste("Log2FC cut-off:", thresh_LFC)
## [1] "Log2FC cut-off: 1"
Read Data
df1 <- read.table('../featureCount/counts.gene_id.run1.pe.txt',
sep="\t", header=TRUE,
row.names = 1) # row.name in cts(matrix)
colnames(df1)
## [1] "Chr"
## [2] "Start"
## [3] "End"
## [4] "Strand"
## [5] "Length"
## [6] "...bam.run2_local.DelMut.PE.run2.bam"
## [7] "...bam.run2_local.FRT19A.CTRL.PE.run2.bam"
## [8] "...bam.run2_local.PointMut.PE.run2.bam"
## [9] "...bam.run2_local.W1118.CTRL.PE.run2.bam"
dim(df1)
## [1] 175 9
df2 <- read.table('../featureCount/counts.gene_id.run1.se.txt',
sep="\t", header=TRUE,
row.names = 1) # row.name in cts(matrix)
colnames(df2)
## [1] "Chr"
## [2] "Start"
## [3] "End"
## [4] "Strand"
## [5] "Length"
## [6] "...bam.run2_local.DelMut.SE.run2.bam"
## [7] "...bam.run2_local.FRT19A.CTRL.SE.run2.bam"
## [8] "...bam.run2_local.PointMut.SE.run2.bam"
## [9] "...bam.run2_local.W1118.CTRL.SE.run2.bam"
dim(df2)
## [1] 175 9
df <- merge(df1, df2, by=c(1,2,3,4,5))
row.names(df) <- df$Chr
head(df)
## Chr Start End Strand Length
## AY187768_BUT6 AY187768_BUT6 1 387 + 387
## AY313771_ISBU3 AY313771_ISBU3 1 993 + 993
## FBgn0000004_17.6 FBgn0000004_17.6 1 7439 + 7439
## FBgn0000005_297 FBgn0000005_297 1 6995 + 6995
## FBgn0000006_412 FBgn0000006_412 1 7567 + 7567
## FBgn0000007_1731 FBgn0000007_1731 1 4648 + 4648
## ...bam.run2_local.DelMut.PE.run2.bam
## AY187768_BUT6 4
## AY313771_ISBU3 15
## FBgn0000004_17.6 5945
## FBgn0000005_297 9477
## FBgn0000006_412 42344
## FBgn0000007_1731 70047
## ...bam.run2_local.FRT19A.CTRL.PE.run2.bam
## AY187768_BUT6 3
## AY313771_ISBU3 4
## FBgn0000004_17.6 845
## FBgn0000005_297 6271
## FBgn0000006_412 69945
## FBgn0000007_1731 2326
## ...bam.run2_local.PointMut.PE.run2.bam
## AY187768_BUT6 6
## AY313771_ISBU3 13
## FBgn0000004_17.6 5921
## FBgn0000005_297 10445
## FBgn0000006_412 33547
## FBgn0000007_1731 60339
## ...bam.run2_local.W1118.CTRL.PE.run2.bam
## AY187768_BUT6 0
## AY313771_ISBU3 11
## FBgn0000004_17.6 3732
## FBgn0000005_297 7325
## FBgn0000006_412 57063
## FBgn0000007_1731 1413
## ...bam.run2_local.DelMut.SE.run2.bam
## AY187768_BUT6 21
## AY313771_ISBU3 0
## FBgn0000004_17.6 6395
## FBgn0000005_297 5765
## FBgn0000006_412 16510
## FBgn0000007_1731 17574
## ...bam.run2_local.FRT19A.CTRL.SE.run2.bam
## AY187768_BUT6 33
## AY313771_ISBU3 0
## FBgn0000004_17.6 618
## FBgn0000005_297 3622
## FBgn0000006_412 22950
## FBgn0000007_1731 1118
## ...bam.run2_local.PointMut.SE.run2.bam
## AY187768_BUT6 19
## AY313771_ISBU3 0
## FBgn0000004_17.6 4149
## FBgn0000005_297 8894
## FBgn0000006_412 13598
## FBgn0000007_1731 16304
## ...bam.run2_local.W1118.CTRL.SE.run2.bam
## AY187768_BUT6 2
## AY313771_ISBU3 0
## FBgn0000004_17.6 3853
## FBgn0000005_297 11389
## FBgn0000006_412 17296
## FBgn0000007_1731 686
colnames(df)
## [1] "Chr"
## [2] "Start"
## [3] "End"
## [4] "Strand"
## [5] "Length"
## [6] "...bam.run2_local.DelMut.PE.run2.bam"
## [7] "...bam.run2_local.FRT19A.CTRL.PE.run2.bam"
## [8] "...bam.run2_local.PointMut.PE.run2.bam"
## [9] "...bam.run2_local.W1118.CTRL.PE.run2.bam"
## [10] "...bam.run2_local.DelMut.SE.run2.bam"
## [11] "...bam.run2_local.FRT19A.CTRL.SE.run2.bam"
## [12] "...bam.run2_local.PointMut.SE.run2.bam"
## [13] "...bam.run2_local.W1118.CTRL.SE.run2.bam"
dim(df)
## [1] 175 13
Clean Up Data
# clean names
colnames(df) <- gsub("\\.\\.\\.bam.run2_local.", "", colnames(df))
colnames(df) <- gsub(".run2.bam", "", colnames(df))
colnames(df) <- gsub("sorted_reads.", "", colnames(df))
# colname_formatted <- c(
# "KO_D0_3", "WT_D0_3", "KO_D2_3", "WT_D2_3", "KO_D8_3", "WT_D8_3",
# "KO_D0_1", "KO_D0_2", "KO_D2_1", "KO_D2_2", "KO_D8_1", "KO_D8_2",
# "WT_D0_1", "WT_D0_2", "WT_D2_1", "WT_D2_2", "WT_D8_1", "WT_D8_2")
# paste(colnames(df), colname_formatted, sep = "->")
# colnames(df) <- colname_formatted
df<-df[,c( "Chr","Start","End","Strand","Length",
"W1118.CTRL.PE","W1118.CTRL.SE",
"FRT19A.CTRL.PE", "FRT19A.CTRL.SE",
"DelMut.PE", "DelMut.SE",
"PointMut.PE" , "PointMut.SE" )]
colnames(df)
## [1] "Chr" "Start" "End" "Strand"
## [5] "Length" "W1118.CTRL.PE" "W1118.CTRL.SE" "FRT19A.CTRL.PE"
## [9] "FRT19A.CTRL.SE" "DelMut.PE" "DelMut.SE" "PointMut.PE"
## [13] "PointMut.SE"
head(df)
## Chr Start End Strand Length W1118.CTRL.PE
## AY187768_BUT6 AY187768_BUT6 1 387 + 387 0
## AY313771_ISBU3 AY313771_ISBU3 1 993 + 993 11
## FBgn0000004_17.6 FBgn0000004_17.6 1 7439 + 7439 3732
## FBgn0000005_297 FBgn0000005_297 1 6995 + 6995 7325
## FBgn0000006_412 FBgn0000006_412 1 7567 + 7567 57063
## FBgn0000007_1731 FBgn0000007_1731 1 4648 + 4648 1413
## W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE DelMut.PE
## AY187768_BUT6 2 3 33 4
## AY313771_ISBU3 0 4 0 15
## FBgn0000004_17.6 3853 845 618 5945
## FBgn0000005_297 11389 6271 3622 9477
## FBgn0000006_412 17296 69945 22950 42344
## FBgn0000007_1731 686 2326 1118 70047
## DelMut.SE PointMut.PE PointMut.SE
## AY187768_BUT6 21 6 19
## AY313771_ISBU3 0 13 0
## FBgn0000004_17.6 6395 5921 4149
## FBgn0000005_297 5765 10445 8894
## FBgn0000006_412 16510 33547 13598
## FBgn0000007_1731 17574 60339 16304
dim(df)
## [1] 175 13
Create cts
cts <- df[, 6:ncol(df)]
cts <- as.matrix(cts)
colnames(cts)
## [1] "W1118.CTRL.PE" "W1118.CTRL.SE" "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
## [5] "DelMut.PE" "DelMut.SE" "PointMut.PE" "PointMut.SE"
head(cts)
## W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE
## AY187768_BUT6 0 2 3 33
## AY313771_ISBU3 11 0 4 0
## FBgn0000004_17.6 3732 3853 845 618
## FBgn0000005_297 7325 11389 6271 3622
## FBgn0000006_412 57063 17296 69945 22950
## FBgn0000007_1731 1413 686 2326 1118
## DelMut.PE DelMut.SE PointMut.PE PointMut.SE
## AY187768_BUT6 4 21 6 19
## AY313771_ISBU3 15 0 13 0
## FBgn0000004_17.6 5945 6395 5921 4149
## FBgn0000005_297 9477 5765 10445 8894
## FBgn0000006_412 42344 16510 33547 13598
## FBgn0000007_1731 70047 17574 60339 16304
paste("lib-size:")
## [1] "lib-size:"
colSums(cts)
## W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE DelMut.PE
## 1633796 795150 591846 243766 4645652
## DelMut.SE PointMut.PE PointMut.SE
## 2908432 4595310 2819218
Filter Based on Expression
expression_filter <- rowSums(cts) >= 1 # default 10
cts <- cts[expression_filter, ]
df <- df[expression_filter, ]
dim(cts)
## [1] 159 8
dim(df)
## [1] 159 13
Read Anno
anno <- read.table(
"../../TEdata/Drosophila_melanogaster.BDGP6.22.96.CombineWith.dm6.transposon.anno.txt.gz",
header=T)
colnames(anno)
## [1] "Gene" "Name" "Type" "Length"
#anno <- anno[, c("peak_id", "seqnames", "start", "end", "width", "geneBiotype", "geneID", "geneName" )]
dim(anno)
## [1] 17933 4
head(anno)
## Gene Name Type Length
## 1 FBgn0000003 7SLRNA:CR32864 ncRNA 299
## 2 FBgn0000008 a protein_coding 5414
## 3 FBgn0000014 abd-A protein_coding 5477
## 4 FBgn0000015 Abd-B protein_coding 11791
## 5 FBgn0000017 Abl protein_coding 12819
## 6 FBgn0000018 abo protein_coding 1794
COUNT.TE output
count <- data.frame(cts)
colnames(count) <- paste(colnames(count),"COUNT", sep = ":")
count_out <- merge(anno, count, by.x=1, by.y=0, all.y=T, sort=F)
head(count_out)
## Gene Name Type Length
## 1 FBgn0026065_Idefix FBgn0026065_Idefix dm6.transposon 7411
## 2 FBgn0000004_17.6 FBgn0000004_17.6 dm6.transposon 7439
## 3 FBgn0000007_1731 FBgn0000007_1731 dm6.transposon 4648
## 4 FBgn0000005_297 FBgn0000005_297 dm6.transposon 6995
## 5 FBgn0005384_3S18 FBgn0005384_3S18 dm6.transposon 6126
## 6 FBgn0000006_412 FBgn0000006_412 dm6.transposon 7567
## W1118.CTRL.PE:COUNT W1118.CTRL.SE:COUNT FRT19A.CTRL.PE:COUNT
## 1 2640 533 2231
## 2 3732 3853 845
## 3 1413 686 2326
## 4 7325 11389 6271
## 5 10501 6263 24469
## 6 57063 17296 69945
## FRT19A.CTRL.SE:COUNT DelMut.PE:COUNT DelMut.SE:COUNT PointMut.PE:COUNT
## 1 429 22273 9938 20847
## 2 618 5945 6395 5921
## 3 1118 70047 17574 60339
## 4 3622 9477 5765 10445
## 5 9972 106540 39477 144444
## 6 22950 42344 16510 33547
## PointMut.SE:COUNT
## 1 7833
## 2 4149
## 3 16304
## 4 8894
## 5 41589
## 6 13598
WriteXLS(x = count_out,
ExcelFileName = 'COUNT.TE.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')
Merge with RNAseq results
df3_rnaseq <- readxl::read_xlsx('../../../RNAseq_201906/post_snakemake/Mac200/liberal/COUNT.xlsx', na="-") # row.name in cts(matrix)
colnames(df3_rnaseq) <- gsub(":COUNT", "", colnames(df3_rnaseq))
df3_rnaseq <- df3_rnaseq[, c("Gene","Name","Type","Length",
"W1118.CTRL.PE", "W1118.CTRL.SE",
"FRT19A.CTRL.PE", "FRT19A.CTRL.SE",
"DelMut.PE", "DelMut.SE",
"PointMut.PE","PointMut.SE")]
colnames(df3_rnaseq)
## [1] "Gene" "Name" "Type" "Length"
## [5] "W1118.CTRL.PE" "W1118.CTRL.SE" "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
## [9] "DelMut.PE" "DelMut.SE" "PointMut.PE" "PointMut.SE"
anno_his <- subset(df3_rnaseq, Type == 'aggregated_multi-loci_gene', select=c(1,2,3,4))
df_TE <- readxl::read_xlsx('COUNT.TE.xlsx', na="-") # row.name in cts(matrix)
colnames(df_TE) <- gsub(":COUNT", "", colnames(df_TE))
colnames(df_TE)
## [1] "Gene" "Name" "Type" "Length"
## [5] "W1118.CTRL.PE" "W1118.CTRL.SE" "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
## [9] "DelMut.PE" "DelMut.SE" "PointMut.PE" "PointMut.SE"
colnames(df3_rnaseq) == colnames(df_TE)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
df <- rbind(df_TE, df3_rnaseq)
head(df)
## # A tibble: 6 x 12
## Gene Name Type Length W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 FBgn… FBgn… dm6.… 7411 2640 533 2231
## 2 FBgn… FBgn… dm6.… 7439 3732 3853 845
## 3 FBgn… FBgn… dm6.… 4648 1413 686 2326
## 4 FBgn… FBgn… dm6.… 6995 7325 11389 6271
## 5 FBgn… FBgn… dm6.… 6126 10501 6263 24469
## 6 FBgn… FBgn… dm6.… 7567 57063 17296 69945
## # … with 5 more variables: FRT19A.CTRL.SE <dbl>, DelMut.PE <dbl>,
## # DelMut.SE <dbl>, PointMut.PE <dbl>, PointMut.SE <dbl>
tail(df)
## # A tibble: 6 x 12
## Gene Name Type Length W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 FBgn… FUM1 prot… 1861 4178 4979 4178
## 2 HIS1 HIS1 aggr… 23529 5281 136 3999
## 3 HIS2A HIS2A aggr… 9843 10930 182 10757
## 4 HIS2B HIS2B aggr… 13685 15569 239 18376
## 5 HIS3 HIS3 aggr… 12167 3055 7 2825
## 6 HIS4 HIS4 aggr… 9543 4387 118 4316
## # … with 5 more variables: FRT19A.CTRL.SE <dbl>, DelMut.PE <dbl>,
## # DelMut.SE <dbl>, PointMut.PE <dbl>, PointMut.SE <dbl>
get cts again
cts <- df[, 5:ncol(df)]
cts <- as.matrix(cts)
row.names(cts) <- df$Gene
colnames(cts)
## [1] "W1118.CTRL.PE" "W1118.CTRL.SE" "FRT19A.CTRL.PE" "FRT19A.CTRL.SE"
## [5] "DelMut.PE" "DelMut.SE" "PointMut.PE" "PointMut.SE"
head(cts)
## W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE
## FBgn0026065_Idefix 2640 533 2231
## FBgn0000004_17.6 3732 3853 845
## FBgn0000007_1731 1413 686 2326
## FBgn0000005_297 7325 11389 6271
## FBgn0005384_3S18 10501 6263 24469
## FBgn0000006_412 57063 17296 69945
## FRT19A.CTRL.SE DelMut.PE DelMut.SE PointMut.PE
## FBgn0026065_Idefix 429 22273 9938 20847
## FBgn0000004_17.6 618 5945 6395 5921
## FBgn0000007_1731 1118 70047 17574 60339
## FBgn0000005_297 3622 9477 5765 10445
## FBgn0005384_3S18 9972 106540 39477 144444
## FBgn0000006_412 22950 42344 16510 33547
## PointMut.SE
## FBgn0026065_Idefix 7833
## FBgn0000004_17.6 4149
## FBgn0000007_1731 16304
## FBgn0000005_297 8894
## FBgn0005384_3S18 41589
## FBgn0000006_412 13598
paste("lib-size:")
## [1] "lib-size:"
colSums(cts)
## W1118.CTRL.PE W1118.CTRL.SE FRT19A.CTRL.PE FRT19A.CTRL.SE DelMut.PE
## 27757124 28607428 27461516 28984435 26569878
## DelMut.SE PointMut.PE PointMut.SE
## 27853744 27904256 28454147
COUNT output
count <- data.frame(cts)
colnames(count) <- paste(colnames(count),"COUNT", sep = ":")
count_out <- merge(anno, count, by.x=1, by.y=0, all.y=T, sort=F)
head(count_out)
## Gene Name Type Length W1118.CTRL.PE:COUNT
## 1 FBgn0000003 7SLRNA:CR32864 ncRNA 299 857775
## 2 FBgn0000008 a protein_coding 5414 498
## 3 FBgn0000014 abd-A protein_coding 5477 205
## 4 FBgn0000015 Abd-B protein_coding 11791 371
## 5 FBgn0000017 Abl protein_coding 12819 974
## 6 FBgn0000018 abo protein_coding 1794 305
## W1118.CTRL.SE:COUNT FRT19A.CTRL.PE:COUNT FRT19A.CTRL.SE:COUNT
## 1 12 708677 18
## 2 250 527 465
## 3 568 176 767
## 4 757 275 384
## 5 1967 946 1359
## 6 291 362 306
## DelMut.PE:COUNT DelMut.SE:COUNT PointMut.PE:COUNT PointMut.SE:COUNT
## 1 749146 26 868286 21
## 2 481 183 473 211
## 3 134 424 144 404
## 4 171 253 123 300
## 5 1291 2001 1268 2324
## 6 247 264 229 252
WriteXLS(x = count_out,
ExcelFileName = 'COUNT.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')
TPM calculation
tpm <- calculateTPM(cts, df$Length)
tpm <- data.frame(tpm)
colnames(tpm) <- paste(colnames(tpm),"TPM", sep = ":")
tpm_out <- merge(anno, tpm, by.x=1, by.y=0, all.y=T, sort=F)
head(tpm_out)
## Gene Name Type Length W1118.CTRL.PE:TPM
## 1 FBgn0000003 7SLRNA:CR32864 ncRNA 299 1.466959e+05
## 2 FBgn0000008 a protein_coding 5414 4.703561e+00
## 3 FBgn0000014 abd-A protein_coding 5477 1.913933e+00
## 4 FBgn0000015 Abd-B protein_coding 11791 1.608937e+00
## 5 FBgn0000017 Abl protein_coding 12819 3.885263e+00
## 6 FBgn0000018 abo protein_coding 1794 8.693468e+00
## W1118.CTRL.SE:TPM FRT19A.CTRL.PE:TPM FRT19A.CTRL.SE:TPM DelMut.PE:TPM
## 1 2.788843 1.272681e+05 4.024877 1.448736e+05
## 2 3.208749 5.226789e+00 5.742302 5.137135e+00
## 3 7.206421 1.725490e+00 9.362761 1.414673e+00
## 4 4.461278 1.252347e+00 2.177368 8.385705e-01
## 5 10.662628 3.962594e+00 7.087885 5.823260e+00
## 6 11.271575 1.083499e+01 11.403819 7.961019e+00
## DelMut.SE:TPM PointMut.PE:TPM PointMut.SE:TPM
## 1 6.483794 1.534252e+05 5.031000
## 2 2.520344 4.615810e+00 2.791711
## 3 5.772317 1.389072e+00 5.283781
## 4 1.599915 5.511369e-01 1.822539
## 5 11.639116 5.226009e+00 12.986381
## 6 10.972574 6.744008e+00 10.061999
tail(tpm_out)
## Gene Name Type
## 16678 FBgn0063755_Osvaldo FBgn0063755_Osvaldo dm6.transposon
## 16679 HIS1 HIS1 aggregated_multi-loci_gene
## 16680 HIS2A HIS2A aggregated_multi-loci_gene
## 16681 HIS2B HIS2B aggregated_multi-loci_gene
## 16682 HIS3 HIS3 aggregated_multi-loci_gene
## 16683 HIS4 HIS4 aggregated_multi-loci_gene
## Length W1118.CTRL.PE:TPM W1118.CTRL.SE:TPM FRT19A.CTRL.PE:TPM
## 16678 1543 0.463957 1.57621759 0.3479979
## 16679 23529 11.477000 0.40165159 9.1262112
## 16680 9843 56.781694 1.28486635 58.6821834
## 16681 13685 58.174356 1.21357648 72.1022407
## 16682 12167 12.839360 0.03997869 12.4674468
## 16683 9543 23.507063 0.85923337 24.2850551
## FRT19A.CTRL.SE:TPM DelMut.PE:TPM DelMut.SE:TPM PointMut.PE:TPM
## 16678 0.04332967 8.39414005 9.76140836 9.10795186
## 16679 0.47168921 0.01965987 0.06021118 0.09206306
## 16680 1.75244157 30.89956586 1.21204721 32.72069271
## 16681 1.15297138 31.18634810 1.59642991 31.42953597
## 16682 0.03297001 6.36342878 0.11643864 7.80748566
## 16683 1.24005135 17.55931525 0.73446308 20.87187391
## PointMut.SE:TPM
## 16678 7.659919365
## 16679 0.009133221
## 16680 0.589472723
## 16681 1.177725006
## 16682 0.076536046
## 16683 0.105087072
WriteXLS(x = tpm_out,
ExcelFileName = 'TPM.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')
FPKKM calculation
fpkm <- calculateFPKM(cts, df$Length)
fpkm <- data.frame(fpkm)
colnames(fpkm) <- paste(colnames(fpkm), "FPKM", sep = ":")
fpkm_out <- merge(anno, fpkm, by.x=1, by.y=0, all.y=T, sort=F)
head(fpkm_out)
## Gene Name Type Length W1118.CTRL.PE:FPKM
## 1 FBgn0000003 7SLRNA:CR32864 ncRNA 299 1.033541e+05
## 2 FBgn0000008 a protein_coding 5414 3.313879e+00
## 3 FBgn0000014 abd-A protein_coding 5477 1.348456e+00
## 4 FBgn0000015 Abd-B protein_coding 11791 1.133571e+00
## 5 FBgn0000017 Abl protein_coding 12819 2.737350e+00
## 6 FBgn0000018 abo protein_coding 1794 6.124955e+00
## W1118.CTRL.SE:FPKM FRT19A.CTRL.PE:FPKM FRT19A.CTRL.SE:FPKM
## 1 1.402915 86308.315631 2.077000
## 2 1.614146 3.544605 2.963261
## 3 3.625157 1.170160 4.831565
## 4 2.244225 0.849293 1.123610
## 5 5.363785 2.687277 3.657636
## 6 5.670113 7.347873 5.884833
## DelMut.PE:FPKM DelMut.SE:FPKM PointMut.PE:FPKM PointMut.SE:FPKM
## 1 9.429870e+04 3.1218971 1.040689e+05 2.4683261
## 2 3.343777e+00 1.2135265 3.130923e+00 1.3696785
## 3 9.208152e-01 2.7793263 9.422135e-01 2.5923465
## 4 5.458281e-01 0.7703468 3.738385e-01 0.8941802
## 5 3.790378e+00 5.6041450 3.544824e+00 6.3714225
## 6 5.181851e+00 5.2832104 4.574489e+00 4.9366522
tail(fpkm_out)
## Gene Name Type
## 16678 FBgn0063755_Osvaldo FBgn0063755_Osvaldo dm6.transposon
## 16679 HIS1 HIS1 aggregated_multi-loci_gene
## 16680 HIS2A HIS2A aggregated_multi-loci_gene
## 16681 HIS2B HIS2B aggregated_multi-loci_gene
## 16682 HIS3 HIS3 aggregated_multi-loci_gene
## 16683 HIS4 HIS4 aggregated_multi-loci_gene
## Length W1118.CTRL.PE:FPKM W1118.CTRL.SE:FPKM FRT19A.CTRL.PE:FPKM
## 16678 1543 0.3268795 0.7929089 0.2359987
## 16679 23529 8.0860837 0.2020490 6.1890419
## 16680 9843 40.0053626 0.6463460 39.7959773
## 16681 13685 40.9865583 0.6104840 48.8969389
## 16682 12167 9.0459304 0.0201111 8.4549382
## 16683 9543 16.5618270 0.4322333 16.4691810
## FRT19A.CTRL.SE:FPKM DelMut.PE:FPKM DelMut.SE:FPKM PointMut.PE:FPKM
## 16678 0.02235987 5.46377154 4.70004335 6.17796243
## 16679 0.24341076 0.01279667 0.02899122 0.06244676
## 16680 0.90433092 20.11262234 0.58359145 22.19458483
## 16681 0.59497999 20.29928978 0.76866877 21.31878773
## 16682 0.01701386 4.14197534 0.05606431 5.29585068
## 16683 0.63991679 11.42941223 0.35363835 14.15748072
## PointMut.SE:FPKM
## 16678 3.758135610
## 16679 0.004480972
## 16680 0.289209106
## 16681 0.577819435
## 16682 0.037550375
## 16683 0.051558176
WriteXLS(x = fpkm_out,
ExcelFileName = 'FPKM.xlsx', row.names = F, SheetNames = 'sheet1', na = '-')
DESeq Experiment Design (One Big Model)
- for QC and comparison with seperated model only
sample <- factor(rep(c("W1118", "FRT19A", "DelMut","PointMut"), each=2))
sample
## [1] W1118 W1118 FRT19A FRT19A DelMut DelMut PointMut PointMut
## Levels: DelMut FRT19A PointMut W1118
batch <- factor(rep(c("PE", "SE"), 4))
batch
## [1] PE SE PE SE PE SE PE SE
## Levels: PE SE
type <- factor(paste(sample, batch, sep = "_"))
type
## [1] W1118_PE W1118_SE FRT19A_PE FRT19A_SE DelMut_PE DelMut_SE
## [7] PointMut_PE PointMut_SE
## 8 Levels: DelMut_PE DelMut_SE FRT19A_PE FRT19A_SE ... W1118_SE
#mouse <- factor(rep(c("Mouse1", "Mouse2", "Mouse3"), 6))
coldata <- data.frame(row.names=colnames(cts),
sample
, batch
, type
)
coldata
## sample batch type
## W1118.CTRL.PE W1118 PE W1118_PE
## W1118.CTRL.SE W1118 SE W1118_SE
## FRT19A.CTRL.PE FRT19A PE FRT19A_PE
## FRT19A.CTRL.SE FRT19A SE FRT19A_SE
## DelMut.PE DelMut PE DelMut_PE
## DelMut.SE DelMut SE DelMut_SE
## PointMut.PE PointMut PE PointMut_PE
## PointMut.SE PointMut SE PointMut_SE
Model fitting
dds <- DESeqDataSetFromMatrix(countData = cts,
colData = coldata,
design = ~ batch + sample
) # converted to alph-order
## converting counts to integer mode
# dds$type <- relevel(dds$type,
# ref = "DMSO_2D"
# )
# dds$mouse <- relevel(dds$mouse,
# ref = "Mouse1"
# )
dds
## class: DESeqDataSet
## dim: 16683 8
## metadata(1): version
## assays(1): counts
## rownames(16683): FBgn0026065_Idefix FBgn0000004_17.6 ... HIS3 HIS4
## rowData names(0):
## colnames(8): W1118.CTRL.PE W1118.CTRL.SE ... PointMut.PE
## PointMut.SE
## colData names(3): sample batch type
dds <-DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept" "batch_SE_vs_PE"
## [3] "sample_FRT19A_vs_DelMut" "sample_PointMut_vs_DelMut"
## [5] "sample_W1118_vs_DelMut"
saveRDS(dds, file = 'dds.oneBigModel.rds')
QC Plots
Histogram of Log10(Counts)
hist(logCounts, main='log10(count+1)')

Dispersion plot
plotDispEsts(dds, main="Dispersion plot")

PCA plots
pcaData <- plotPCA(vsd, intgroup=c("sample", "batch"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=batch, shape=sample)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()

With label
library(ggplot2)
library(ggrepel)
pcaplot <- ggplot(pcaData, aes(PC1, PC2, color=batch, shape=sample)) +
geom_point(size=3) +
#geom_text(aes(label=type),hjust=1, vjust=0) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed() +
coord_fixed(ratio = percentVar[1]/percentVar[2]/2,
xlim = NULL, ylim = NULL, expand = T,
clip = "off")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
### geom_label_repel
pcaplot +
geom_label_repel(aes(label = type),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_classic()

Sample Heatmap
library("RColorBrewer")
library('pheatmap')
library("PoiClaClu")
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
poisd <- PoissonDistance(t(counts(dds)))
samplePoisDistMatrix <- as.matrix( poisd$dd )
rownames(samplePoisDistMatrix) <- paste( coldata$type, sep="-" )
colnames(samplePoisDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(samplePoisDistMatrix,
clustering_distance_rows=poisd$dd,
clustering_distance_cols=poisd$dd,
col=colors,
clustering_method='complete'
)

Results from different contrasts
One Big Model (DelMut_vs_FRT19A)
#resultsNames(dds)
name <- "DelMut_vs_FRT19A"
contrast <- c('sample', 'DelMut', 'FRT19A')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "DelMut_vs_FRT19A"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
##
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1749, 10%
## LFC < 0 (down) : 2134, 13%
## outliers [1] : 0, 0%
## low counts [2] : 3235, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE TRUE
## 15170 1513
## up_idx
## FALSE TRUE
## 15683 1000
## down_idx
## FALSE TRUE
## 16170 513






One Big Model (PointMut_vs_FRT19A)
#resultsNames(dds)
name <- "PointMut_vs_FRT19A"
contrast <- c('sample', 'PointMut', 'FRT19A')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "PointMut_vs_FRT19A"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
##
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1736, 10%
## LFC < 0 (down) : 1986, 12%
## outliers [1] : 0, 0%
## low counts [2] : 3235, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE TRUE
## 15182 1501
## up_idx
## FALSE TRUE
## 15607 1076
## down_idx
## FALSE TRUE
## 16258 425






One Big Model (DelMut_vs_W1118)
#resultsNames(dds)
name <- "DelMut_vs_W1118"
contrast <- c('sample', 'DelMut', 'W1118')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "DelMut_vs_W1118"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
##
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1530, 9.2%
## LFC < 0 (down) : 1797, 11%
## outliers [1] : 0, 0%
## low counts [2] : 3235, 19%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE TRUE
## 15571 1112
## up_idx
## FALSE TRUE
## 15970 713
## down_idx
## FALSE TRUE
## 16284 399






One Big Model (PointMut_vs_W1118)
#resultsNames(dds)
name <- "PointMut_vs_W1118"
contrast <- c('sample', 'PointMut', 'W1118')
res <- lfcShrink(dds, contrast = contrast, type = 'ashr')
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
res2 <- results(dds, contrast = contrast)
process_deseq_res(res = res, res2=res2, name = name, anno = anno, norm_exp = tpm)
## [1] "PointMut_vs_W1118"
## [1] "\n>>> Summary using FDR cut-off only (LFC not used)"
##
## out of 16683 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1465, 8.8%
## LFC < 0 (down) : 1727, 10%
## outliers [1] : 0, 0%
## low counts [2] : 3882, 23%
## (mean count < 9)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
##
## [1] "\n>>> Summary using both FDR and LFC_shrinked cut-off"
## sig_idx
## FALSE TRUE
## 15597 1086
## up_idx
## FALSE TRUE
## 15965 718
## down_idx
## FALSE TRUE
## 16315 368





